In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
In [2]:
# show what files sit next to the notebook (the process's current directory)
os.listdir(os.curdir)
Out[2]:
In [3]:
# read both CSV files (paths are relative to the working directory) into DataFrames
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')
In [4]:
# peek at the first five rows of the training frame
train.iloc[:5]
Out[4]:
In [5]:
# first pass over the data: summary statistics, then the dtype of every column
for overview in (train.describe(), train.dtypes):
    print(overview)
In [6]:
# per-column count of missing values - there are quite a few
train.isna().sum()
Out[6]:
In [7]:
# begin with the Sex column: how many rows carry each value?
train['Sex'].value_counts()
Out[7]:
In [8]:
# convert sex to 1 (male) and 0 (female)
def sexconverter(row):
    """Encode the ``Sex`` field of one row as an integer flag.

    Args:
        row: any mapping-like object indexable by ``'Sex'`` (e.g. a DataFrame
            row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        1 when the value is the string ``'male'`` or is already the encoded
        value 1; 0 for every other value (``'female'``, 0, missing, ...).
    """
    value = row['Sex']
    if isinstance(value, str):
        # raw label straight from the CSV
        return 1 if value == 'male' else 0
    # Non-string value: the column was already encoded (e.g. this cell was run
    # twice). Pass 1 through instead of collapsing every row to 0.
    return 1 if value == 1 else 0
# run the encoder once per row and overwrite the Sex column with the result
train['Sex'] = train.apply(sexconverter, axis='columns')
In [9]:
# rows with Age below one year - only 7 observations
train.loc[train['Age'] < 1]
Out[9]:
In [10]:
# look at fare
# significant non-normality and right skewness
fig, ax = plt.subplots(figsize=(10, 10))
if hasattr(sns, "histplot"):
    # sns.distplot is deprecated since seaborn 0.11. A density-scaled histogram
    # with a KDE overlay (cut=3) is the documented equivalent.
    sns.histplot(train.Fare, kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
else:
    # older seaborn releases only ship distplot
    sns.distplot(train.Fare, ax=ax)
ax.set_title("Distribution of Fare")
ax
Out[10]:
In [11]:
# does fare relate to survival?
# colouring by Sex already shows that it was mainly men who did not survive
fig, ax = plt.subplots(figsize=(10, 10))
sns.swarmplot(data=train, x='Survived', y='Fare', hue='Sex', ax=ax)
Out[11]:
In [12]:
# survival counts by gender, including the marginal totals
tmp = pd.crosstab(train['Sex'], train['Survived'], margins=True)
tmp
Out[12]:
In [13]:
# the same gender/survival table, normalised across each row instead of raw counts
tmp_freq = pd.crosstab(train['Sex'], train['Survived'], normalize="index", margins=True)
tmp_freq
Out[13]:
In [14]:
# what about passenger class and survival?
# heatmap of counts per Pclass, split by (Survived, Sex) -
# mostly lower class people did not survive
tmp = pd.crosstab(index=train['Pclass'], columns=[train['Survived'], train['Sex']])
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(tmp, cmap="plasma", ax=ax)
Out[14]:
In [15]:
# what about age?
# violin outline of Age per outcome, with the individual passengers
# (coloured by Sex, half transparent) drawn on the same axes
fig, ax = plt.subplots(figsize=(10, 10))
sns.violinplot(data=train, x="Survived", y="Age", ax=ax)
sns.swarmplot(data=train, x="Survived", y="Age", hue="Sex", alpha=0.5, ax=ax)
Out[15]:
In [34]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import numpy as np
from sklearn.model_selection import train_test_split
In [17]:
# Gaussian Naive Bayes classifier, constructed with the library's default arguments;
# it is fitted and evaluated in the cells below
clf = GaussianNB()
In [ ]:
# impute missing Age values with the mean of the observed ages
# (Series.mean skips NaN; the mean is computed once and fillna is vectorised,
# so re-running the cell is a no-op)
# NOTE(review): the mean is taken over all rows before the train/test split,
# so the held-out rows influence the imputed value - confirm this is acceptable
train['Age'] = train['Age'].fillna(train['Age'].mean())
In [29]:
# model inputs as plain arrays: three feature columns, and the target
# (selected with a one-element column list, so y stays 2-D)
X = train.loc[:, ['Sex', 'Age', 'Pclass']].values
y = train.loc[:, ['Survived']].values
In [30]:
# hold out 20% of the rows for evaluation; the fixed seed (arbitrary value)
# makes the split, and every metric computed from it, reproducible on re-run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [32]:
# fit on the training split; the 2-D target is flattened to 1-D first
clf.fit(X_train, np.ravel(y_train))
Out[32]:
In [40]:
# accuracy on the held-out split, reported as a percentage
y_pred = clf.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy of GNB model is %.2f%%" % (100 * acc))
In [41]:
# ROC curve inputs: score each held-out row with the probability in the last
# predict_proba column (presumably Survived == 1 - verify against clf.classes_)
probs = clf.predict_proba(X_test)
preds = probs[..., -1]
fpr, tpr, threshold = metrics.roc_curve(y_true=y_test, y_score=preds)
roc_auc = metrics.auc(x=fpr, y=tpr)
In [49]:
# draw the ROC curve (blue) against the dashed red diagonal from (0, 0) to (1, 1)
fig, ax = plt.subplots(figsize=(15, 15))
axis_font = {'fontname':'Arial', 'size':'22'}
ax.plot(fpr, tpr, 'b', label="ROC curve(area=%0.2f)" % roc_auc)
ax.plot([0, 1], [0, 1], "r--")
ax.set_xlabel("False Positive Rate", **axis_font)
ax.set_ylabel("True Positive Rate", **axis_font)
ax.legend(loc="lower right")
Out[49]: